Question 1

a)

# embed the handwritten/scanned solution for Question 1a
knitr::include_graphics("hw2_q1.png")

b)

We use \(\chi^2_{2,0.5}\) to indicate that the density contour will have 50% of the probability, and this value will equal \(c^2\). We know our major axis is \(c\sqrt{\lambda_1}e_1\) and our minor axis is \(c\sqrt{\lambda_2}e_2\) from the center, where \(\lambda_1\) and \(\lambda_2\) are eigenvalues of the covariance matrix and \(\lambda_1 > \lambda_2\).

# the chi-square(2) median is c^2 for the 50% probability contour
c_squared <- qchisq(0.5,2)
# NOTE(review): binding `c` to a number shadows base::c as a data object;
# function calls like c(3,1) still resolve to base::c, but a more
# distinctive name (e.g. c_val) would be safer
c <- sqrt(c_squared)
# covariance matrix of the bivariate normal from the problem statement
covariance <- rbind(c(2, -3*sqrt(3)/5), c(-3*sqrt(3)/5, 1.5))
# eigendecomposition: eigenvalues/eigenvectors determine the contour axes
eigen(covariance)
## eigen() decomposition
## $values
## [1] 2.8188779 0.6811221
## 
## $vectors
##            [,1]       [,2]
## [1,] -0.7854585 -0.6189143
## [2,]  0.6189143 -0.7854585
# half-axes of the 50% contour, measured from the center: c*sqrt(lambda_i)*e_i
major_axis <- c*sqrt(eigen(covariance)$values[1])*eigen(covariance)$vectors[,1]
minor_axis <- c*sqrt(eigen(covariance)$values[2])*eigen(covariance)$vectors[,2]

# these are vectors from the center
major_axis
## [1] -1.552706  1.223479
minor_axis
## [1] -0.6014101 -0.7632441

c)

Now, we plot the constant-density contour that contains 50% of the probability.

library(MASS)     # for mvrnorm()
library(ellipse)  # for ellipse() contour coordinates
## 
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
## 
##     pairs
# fix the RNG seed so the simulated sample is reproducible
set.seed(605574052)
# NOTE(review): `mean` shadows base::mean as a data object; mean(x) calls
# later in the file still resolve to the function, but renaming is safer
mean <- c(3,1)
# ellipse() returns an origin-centered contour; shift it to be centered at
# the mean (the same NOTE applies: `ellipse` now shadows ellipse::ellipse)
ellipse <- t(t(ellipse(covariance, level=0.5, npoints=1000)) + mean)
# create a random sample of 3000 observations following our MVN distribution
X <- mvrnorm(n=3000, mu=mean, Sigma=covariance)
plot(X, xlab = "X", ylab = "Y", asp=0.9)
lines(ellipse, col="blue")            # 50% constant-density contour
points(mean[1], mean[2], col="blue")  # distribution center
# draw the major and minor half-axes outward from the center
segments(mean[1], mean[2], mean[1]+major_axis[1], mean[2]+major_axis[2], col="blue")
segments(mean[1], mean[2], mean[1]+minor_axis[1], mean[2]+minor_axis[2], col="blue")

Question 2

knitr::include_graphics("hw2_q2_1.png")  # Question 2 solution, page 1

knitr::include_graphics("hw2_q2_2.png")  # Question 2 solution, page 2

Question 3

knitr::include_graphics("hw2_q3_1.png")  # Question 3 solution, page 1

knitr::include_graphics("hw2_q3_2.png")  # Question 3 solution, page 2

Question 4

knitr::include_graphics("hw2_q4_1.png")  # Question 4 solution, page 1

knitr::include_graphics("hw2_q4_2.png")  # Question 4 solution, page 2

Question 5

a)

# Question 5 data: n = 11 bivariate observations
x_1 <- c(1, 1, 3, 3, 4, 5, 5, 7, 9, 10, 13)
x_2 <- c(17.95, 19, 17.85, 16.5, 14.1, 12.15, 11, 9.35, 6, 4.15, 2.99)
X <- cbind(x_1, x_2)

# getting our sample mean and covariance matrix
mu <- c(mean(x_1), mean(x_2))
sigma <- cov(X)

Now, we calculate the squared statistical distances using the formula \((x_i-\bar{x})^TS^{-1}(x_i-\bar{x})\).

# the distances are in order
# Squared statistical (Mahalanobis) distances (x_i - mu)' S^{-1} (x_i - mu)
# for every row of X.
#
# X     - numeric matrix of observations, one row per observation
# mu    - numeric vector of sample means (length = ncol(X))
# sigma - sample covariance matrix S
#
# Returns a data.frame with columns:
#   x - observation index, 1..nrow(X)
#   y - squared distance for that observation
#
# The original grew a data.frame with rbind() inside a loop and leaned on a
# global `dist` object; stats::mahalanobis() computes the same quantities
# vectorized, with identical results.
dist_func <- function(X, mu, sigma) {
  data.frame(x = seq_len(nrow(X)),
             y = mahalanobis(X, center = mu, cov = sigma))
}

# squared distances for the 11 observations, in original row order
output <- dist_func(X, mu, sigma)
output
##     x         y
## 1   1 1.6457872
## 2   2 1.5278722
## 3   3 3.5264271
## 4   4 0.9103886
## 5   5 0.1662493
## 6   6 0.2185317
## 7   7 1.8649088
## 8   8 0.2630483
## 9   9 1.2769104
## 10 10 2.3281935
## 11 11 6.2716829

b)

# TRUE when a squared distance is below the chi-square(2) median, i.e. the
# point falls inside the estimated 50% probability contour
output[,2] < qchisq(0.5,2)
##  [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE

We observe that 5 of the 11 squared distances are less than the chi-squared quantile above, meaning these 5 points fall within the estimated 50% probability contour of a bivariate normal distribution.

c)

# Chi-square Q-Q plot: sort the squared distances and plot them against the
# chi-square(2) quantiles at probability levels (i - 1/2)/n.
ordered_dist <- output[order(output$y),]
n <- nrow(ordered_dist)  # generalizes the previously hard-coded 11
plot(qchisq((seq_len(n) - 0.5)/n, 2), ordered_dist[,2], xlab="Chi-Square Quantiles", ylab="Squared Distances")

d)

Based on the previous parts, the data appear approximately bivariate normal: about half of the observations (\(\frac{5}{11}\)) lie within the 50% probability contour, which is what we would expect under bivariate normality. Moreover, in the chi-square plot the points loosely follow the straight-line pattern expected when the squared distances match the chi-square quantiles, further supporting the normality claim.

Question 6

# load the ceramic chemical-composition data and inspect the first rows
ceramic <- read.csv("Chemical Composion of Ceramic.csv")
head(ceramic)
##   Ceramic.Name Part Na2O  MgO Al2O3  SiO2  K2O  CaO TiO2 Fe2O3 MnO CuO ZnO PbO2
## 1      FLQ-1-b Body 0.62 0.38 19.61 71.99 4.84 0.31 0.07  1.18 630  10  70   10
## 2      FLQ-2-b Body 0.57 0.47 21.19 70.09 4.98 0.49 0.09  1.12 380  20  80   40
## 3      FLQ-3-b Body 0.49 0.19 18.60 74.70 3.47 0.43 0.06  1.07 420  20  50   50
## 4      FLQ-4-b Body 0.89 0.30 18.01 74.19 4.01 0.27 0.09  1.23 460  20  70   60
## 5      FLQ-5-b Body 0.03 0.36 18.41 73.99 4.33 0.65 0.05  1.19 380  40  90   40
## 6      FLQ-6-b Body 0.62 0.18 18.82 73.79 4.28 0.30 0.04  0.96 350  20  80   10
##   Rb2O SrO Y2O3 ZrO2 P2O5
## 1  430   0   40   80   90
## 2  430 -10   40  100  110
## 3  380  40   40   80  200
## 4  380  10   40   70  210
## 5  360  10   30   80  150
## 6  390  10   40   80  130

a)

# we observe the marginal normality first:
# normal Q-Q plots for the four variables of interest
qqnorm(ceramic$MnO)

qqnorm(ceramic$CuO)

qqnorm(ceramic$ZrO2)

qqnorm(ceramic$P2O5)

Examining the qq-plots for the four variables of interest, the first and last plots hint at non-normality of the univariate data, since their points do not form a straight line. In the second and third plots, however, the points show a much stronger linear association.

# now, we observe the multivariate normality:
X <- cbind(ceramic$MnO, ceramic$CuO, ceramic$ZrO2, ceramic$P2O5)
mu <- c(mean(ceramic$MnO), mean(ceramic$CuO), mean(ceramic$ZrO2), mean(ceramic$P2O5))
sigma <- cov(X)

# chi-square Q-Q plot of the squared distances against chi-square(4) quantiles
output <- dist_func(X, mu, sigma)
ordered_dist <- output[order(output$y),]
n <- nrow(ordered_dist)  # generalizes the previously hard-coded 88
plot(qchisq((seq_len(n) - 0.5)/n, 4), ordered_dist[,2], xlab="Chi-Square Quantiles", ylab="Squared Distances")

Looking at the above chi-square plot, there doesn’t seem to be good multivariate normality either because the points do not really follow the expected curve that we would normally spot in a chi-square plot. But, perhaps this plot would look better if we got rid of some of the outliers? This will be explained in part b).

b)

First, we will try to remove some of the outliers (points with a squared distance \(>15\)), then plot the chi-square plot again.

# the outliers are:
# the three largest squared distances (all > 15); the x column gives the
# original observation numbers
ordered_dist[86:88,]
##     x        y
## 83 83 16.38079
## 43 43 17.05710
## 52 52 19.66541
# now we remove these points:
# NOTE(review): 86:88 and 1:85 are hard-coded for n = 88 with exactly 3
# outliers; re-derive these if the data or the threshold changes
ordered_dist <- ordered_dist[1:85,]
plot(qchisq((1:85-0.5)/85, 4), ordered_dist[,2], xlab="Chi-Square Quantiles", ylab="Squared Distances")

This plot is certainly easier to interpret: with the outliers removed, the points resemble the expected straight-line pattern, indicating reasonable multivariate normality. Next, we perform a Box-Cox transformation on the univariate data:

# Box-Cox profile log-likelihoods for MnO and P2O5 (intercept-only models)
pwr1 <- MASS::boxcox(lm(ceramic$MnO~1))

pwr4 <- MASS::boxcox(lm(ceramic$P2O5~1))

# extract the lambda values that maximize the profile log-likelihood
lambda_1 <- pwr1$x[which.max(pwr1$y)]
lambda_4 <- pwr4$x[which.max(pwr4$y)]

Now, we create a new set of qqplots based on the derived numbers above.

# Q-Q plots of the Box-Cox transformed data, (x^lambda - 1)/lambda
qqnorm((ceramic$MnO^lambda_1 - 1) / lambda_1)

qqnorm((ceramic$P2O5^lambda_4 - 1) / lambda_4)

In the modified qq-plots above, there seems to be a better linear trend, indicating a better normality of the univariate data.

Question 7

a)

library(ICSNP)  # provides HotellingsT2() and the LASERI data set
## Loading required package: mvtnorm
## Loading required package: ICS
data(LASERI)
head(LASERI)
##      Sex Age Height Weight Waist   Hip PWVT3 PWVT4  BMI   WHR HRT1 HRT2 HRT3
## 1   Male  33    181     88  86.9 102.0  17.1  10.9 26.9 0.852   75   80   91
## 2 Female  30    152     58  76.5  91.5  13.2   7.4 25.1 0.836   65   65   82
## 3 Female  33    174    104  85.4 114.5  13.3   8.1 34.4 0.746   67   73   79
## 4 Female  36    168     81  90.6 107.7  16.0  16.0 28.7 0.841   61   75   74
## 5 Female  36    172     90  88.1 107.8  15.8   9.0 30.4 0.817   68   76   80
## 6 Female  33    165    101 124.7 118.1  10.8   8.8 37.1 1.056   57   73   67
##   HRT4 COT1 COT2 COT3 COT4 SVRIT1 SVRIT2 SVRIT3 SVRIT4 PWVT1 PWVT2 HRT1T4
## 1   64 7.02 5.92 6.21 5.67   2228   2983   2196   2698  10.6  19.7     11
## 2   65 5.60 3.74 4.07 5.27   1827   3085   2981   1957   7.2  11.6      0
## 3   66 7.55 5.32 5.75 6.19   1946   2828   2676   2353   7.0  11.1      1
## 4   74 6.03 4.39 4.75 4.75   2103   3024   3118   2690   9.1  16.7    -13
## 5   64 7.98 5.05 5.05 6.91   1757   2928   2882   2062   9.0  15.6      4
## 6   49 4.23 4.17 3.82 3.51   3251   3718   4035   3886   8.3  11.4      8
##   COT1T4 SVRIT1T4 PWVT1T4 HRT1T2 COT1T2 SVRIT1T2 PWVT1T2
## 1   1.35     -470    -0.3     -5   1.10     -755    -9.1
## 2   0.33     -130    -0.2      0   1.86    -1258    -4.4
## 3   1.36     -407    -1.1     -6   2.23     -882    -4.1
## 4   1.28     -587    -6.9    -14   1.64     -921    -7.6
## 5   1.07     -305     0.0     -8   2.93    -1171    -6.6
## 6   0.72     -635    -0.5    -16   0.06     -467    -3.1
# we observe the marginal normality first:
# normal Q-Q plots of heart rate at the four measurement times
qqnorm(LASERI$HRT1)

qqnorm(LASERI$HRT2)

qqnorm(LASERI$HRT3)

qqnorm(LASERI$HRT4)

From the above four qq-plots, we notice that all the qq-plots seem to hint at good normality as most points follow a straight linear trend. So, we move on to observe the multivariate normality by constructing a chi-square plot.

# squared-distance (chi-square) diagnostics for the four heart-rate variables
X <- cbind(LASERI$HRT1, LASERI$HRT2, LASERI$HRT3, LASERI$HRT4)
mu <- c(mean(LASERI$HRT1), mean(LASERI$HRT2), mean(LASERI$HRT3), mean(LASERI$HRT4))
sigma <- cov(X)

# chi-square Q-Q plot of the squared distances against chi-square(4) quantiles
output <- dist_func(X, mu, sigma)
ordered_dist <- output[order(output$y),]
n <- nrow(ordered_dist)  # generalizes the previously hard-coded 223
plot(qchisq((seq_len(n) - 0.5)/n, 4), ordered_dist[,2], xlab="Chi-Square Quantiles", ylab="Squared Distances")

There appear to be two outliers distorting the shape of the plot above. We will locate and remove them, then redraw the chi-square plot.

# the outliers are:
# the two largest squared distances (observations 66 and 219)
ordered_dist[c(222,223),]
##       x        y
## 66   66 33.31074
## 219 219 53.67008
# now we remove these points:
# NOTE(review): c(222,223) and 1:221 are hard-coded for n = 223 with exactly
# 2 outliers; re-derive these if the data changes
ordered_dist <- ordered_dist[1:221,]
plot(qchisq((1:221-0.5)/221, 4), ordered_dist[,2], xlab="Chi-Square Quantiles", ylab="Squared Distances")

Our chi-square plot now looks much better in terms of multivariate normality.

b)

We will perform the Hotelling Test now that our data is approximately multivariate normal.

# One-sample Hotelling's T^2 test of H0: mu = (0,0,0,0)' for the four
# heart-rate measurements HRT1-HRT4.
# NOTE(review): the original used LASERI[12:15], which (per the head(LASERI)
# printout above, where HRT1 is column 11) selects HRT2, HRT3, HRT4 and COT1
# -- off by one from the intended HRT1-HRT4. Selecting columns by name fixes
# this; the transcript below is from the old selection and should be
# regenerated by re-knitting.
HotellingsT2(LASERI[c("HRT1", "HRT2", "HRT3", "HRT4")])
## 
##  Hotelling's one sample T2-test
## 
## data:  LASERI[12:15]
## T.2 = 3350.5, df1 = 4, df2 = 219, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to c(0,0,0,0)

Here, we tested the null hypothesis \(H_0: \mu = (0,0,0,0)^T\) against the alternative \(H_a: \mu \neq (0,0,0,0)^T\). Since the p-value is far below 0.05, we reject the null hypothesis and conclude that the true mean vector is not the zero vector.